package com.guiji.quartz.domain;

import cn.hutool.http.HttpException;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpResponse;
import cn.wanghaomiao.xpath.model.JXDocument;
import com.guiji.quartz.task.DataSaveUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @program: cms-vue-plus
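 * @description: Crawls the techcircle.in technology listing page and hands each linked article (title, text, first image) to DataSaveUtil.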
 * @author: wangxiaowen
 * @create: 2021-11-9 13:05
 **/
public class Techcircle {
	private static final Logger log = LoggerFactory.getLogger(Techcircle.class);

	/** Listing page that is scanned for article links; also passed to DataSaveUtil.saveData with every article. */
	private static final String LIST_URL = "https://www.techcircle.in/category/technology";

	/** Source label passed to DataSaveUtil.saveData. */
	private static final String SOURCE_NAME = "Techcircle-IndiaTech";

	private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36";

	/** Timeout in milliseconds applied to every HTTP request. */
	private static final int TIMEOUT_MS = 15000;

	/** Pause in milliseconds before each article request. */
	private static final long REQUEST_INTERVAL_MS = 1000L;

	/** Articles whose joined content is longer than this are skipped. */
	private static final int MAX_CONTENT_LENGTH = 50000;

	/** Articles with fewer real text characters than this (separators and whitespace removed) are skipped. */
	private static final int MIN_TEXT_LENGTH = 20;

	/** Marker appended after every extracted paragraph. */
	private static final String PARAGRAPH_SEPARATOR = "$$$";

	/**
	 * Matches a double-quoted, dated article URL; group 1 is the URL without the quotes.
	 * The dots of the host name are escaped so they only match literal dots.
	 */
	private static final Pattern ARTICLE_URL =
		Pattern.compile("\"(https://www\\.techcircle\\.in/[0-9]{4}/[0-9]{2}/[0-9]{2}/.*?)\"");

	/** Matches the '$' of the paragraph separator and any whitespace, i.e. everything that is not article text. */
	private static final Pattern NON_TEXT = Pattern.compile("\\$|[\\s\\p{Zs}]");

	/**
	 * Entry point of the crawl: downloads the listing page, extracts the article links from it
	 * and processes every linked article.
	 * <p>
	 * A failure to download the listing page is logged and ends the run without an exception.
	 *
	 * @throws Exception declared for backward compatibility; in practice an
	 *                   {@link InterruptedException} when the thread is interrupted while pausing between requests
	 */
	public static void mainMethod() throws Exception {
		String resp;
		try {
			resp = HttpRequest.get(LIST_URL)
				.header("Accept", "*/*")
				.header("User-Agent", USER_AGENT)
				.header("X-Requested-With", "XMLHttpRequest")
				// "br" is intentionally not advertised: only gzip/deflate are assumed to be decoded
				// by the HTTP client. NOTE(review): confirm against the Hutool version in use.
				.header("Accept-Encoding", "gzip, deflate")
				.header("Accept-Language", "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7")
				// timeout() bounds reading as well as connecting, so a stalled response cannot block the job forever.
				.timeout(TIMEOUT_MS)
				.execute()
				.body();
		} catch (Exception e) {
			// Pass the exception as the last argument so the stack trace is kept.
			log.error("{} 代理失败1", LIST_URL, e);
			return;
		}
		getContent(getUrl(resp));
	}

	/**
	 * Downloads every article in {@code listContent}, extracts its title, paragraphs and first image,
	 * and passes the result to {@code DataSaveUtil.saveData}. An article that cannot be fetched, parsed
	 * or saved is logged and skipped; it never aborts the remaining articles.
	 *
	 * @param listContent article URLs (without surrounding quotes)
	 * @throws InterruptedException if the thread is interrupted during the pause between requests
	 */
	private static void getContent(Set<String> listContent) throws InterruptedException {
		for (String url : listContent) {
			Thread.sleep(REQUEST_INTERVAL_MS);
			log.info("getContent执行到-{},listContent长度-{}", url, listContent.size());

			JXDocument jxDocument;
			try {
				String doc = HttpRequest.get(url)
					.header("User-Agent", USER_AGENT)
					.timeout(TIMEOUT_MS)
					.execute()
					.body();
				jxDocument = new JXDocument(doc);
			} catch (Exception e) {
				// Without a parsed document nothing below can work, so skip this article right away.
				log.error("请求详情页失败 {}", url, e);
				continue;
			}

			Object titleNode = selectFirst(jxDocument, "//h1/text()");
			if (!(titleNode instanceof String)) {
				log.error("标题错误-------------" + url + "-------------------标题错误");
				continue;
			}
			String title = (String) titleNode;

			StringBuilder content = new StringBuilder();
			try {
				for (Object paragraph : jxDocument.sel("//div[@class='article-content']//p/text()")) {
					content.append(paragraph).append(PARAGRAPH_SEPARATOR);
				}
			} catch (Exception e) {
				log.error("Failed to extract article content from {}", url, e);
				continue;
			}

			Object imgNode = selectFirst(jxDocument, "//img/@data-srcset");
			if (imgNode == null) {
				log.error("获取图片失败 {}", url);
				continue;
			}
			List<Object> newImgs = new ArrayList<>();
			newImgs.add(imgNode.toString());

			// Skip oversized articles and articles with next to no real text.
			int textLength = NON_TEXT.matcher(content).replaceAll("").length();
			if (content.length() > MAX_CONTENT_LENGTH || textLength < MIN_TEXT_LENGTH) {
				continue;
			}

			try {
				DataSaveUtil.saveData(title, content.toString(), newImgs, url, new Date(), LIST_URL, SOURCE_NAME);
				log.info(Thread.currentThread().getName() + "-----------------techcircle----------------");
			} catch (Exception e) {
				log.error("Failed to save article {}", url, e);
			}
		}
	}

	/**
	 * Evaluates {@code xpath} against {@code document} and returns the first match.
	 *
	 * @param document parsed page
	 * @param xpath    expression to evaluate
	 * @return the first match, or {@code null} when there is no match or the evaluation fails
	 */
	private static Object selectFirst(JXDocument document, String xpath) {
		try {
			List<?> matches = document.sel(xpath);
			return (matches == null || matches.isEmpty()) ? null : matches.get(0);
		} catch (Exception e) {
			log.warn("Failed to evaluate xpath {}", xpath, e);
			return null;
		}
	}

	/**
	 * Collects the distinct dated article URLs that appear double-quoted in {@code content}.
	 *
	 * @param content HTML of the listing page; may be {@code null} or empty
	 * @return the URLs without their surrounding quotes, in order of first appearance; never {@code null}
	 */
	private static Set<String> getUrl(String content) {
		Set<String> res = new LinkedHashSet<>();
		if (content == null || content.isEmpty()) {
			return res;
		}
		Matcher m = ARTICLE_URL.matcher(content);
		while (m.find()) {
			res.add(m.group(1));
		}
		return res;
	}

}

